#import Libraries
import plotly.express as ex
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Read data
bank = pd.read_csv(r"C:\Users\junej\Documents\Credit card churn prediction\BankChurners.csv", encoding='latin1')
bank.head()
uncoded_bank=bank
bank.head(3)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
3 rows × 21 columns
bank.describe()
| CLIENTNUM | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.012700e+04 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
| mean | 7.391776e+08 | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 |
| std | 3.690378e+07 | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 |
| min | 7.080821e+08 | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 |
| 25% | 7.130368e+08 | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 |
| 50% | 7.179264e+08 | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 |
| 75% | 7.731435e+08 | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 |
| max | 8.283431e+08 | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 |
bank.isnull().sum()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
ex.pie(bank,names='Attrition_Flag',title='Proportion of churn vs not churn customers',hole=0.33)
As we can see, only 16% of the data samples represent churn customers; in the following steps, I will use SMOTE to upsample the churn samples to match them with the regular customer sample size
#One Hot Encoding
bank.Attrition_Flag = bank.Attrition_Flag.replace({'Attrited Customer':1,'Existing Customer':0})
bank.Gender = bank.Gender.replace({'F':1,'M':0})
bank= pd.concat([bank,pd.get_dummies(bank['Education_Level']).drop(columns=['Unknown'])],axis=1)
bank = pd.concat([bank,pd.get_dummies(bank['Income_Category']).drop(columns=['Unknown'])],axis=1)
bank = pd.concat([bank,pd.get_dummies(bank['Marital_Status']).drop(columns=['Unknown'])],axis=1)
bank = pd.concat([bank,pd.get_dummies(bank['Card_Category']).drop(columns=['Platinum'])],axis=1)
bank.drop(columns = ['Education_Level','Income_Category','Marital_Status','Card_Category','CLIENTNUM'],inplace=True)
Here we one hot encode all the categorical features describing different statuses of a customer.
bank
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | ... | $40K - $60K | $60K - $80K | $80K - $120K | Less than $40K | Divorced | Married | Single | Blue | Gold | Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | 0 | 3 | 39 | 5 | 1 | 3 | 12691.0 | 777 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 1 | 0 | 49 | 1 | 5 | 44 | 6 | 1 | 2 | 8256.0 | 864 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
| 2 | 0 | 51 | 0 | 3 | 36 | 4 | 1 | 0 | 3418.0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 3 | 0 | 40 | 1 | 4 | 34 | 3 | 4 | 1 | 3313.0 | 2517 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 0 | 40 | 0 | 3 | 21 | 5 | 1 | 0 | 4716.0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10122 | 0 | 50 | 0 | 2 | 40 | 3 | 2 | 3 | 4003.0 | 1851 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 10123 | 1 | 41 | 0 | 2 | 25 | 4 | 2 | 3 | 4277.0 | 2186 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 10124 | 1 | 44 | 1 | 1 | 36 | 5 | 3 | 4 | 5409.0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 10125 | 1 | 30 | 0 | 2 | 36 | 4 | 3 | 3 | 5281.0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 10126 | 1 | 43 | 1 | 2 | 25 | 6 | 2 | 4 | 10388.0 | 1961 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
10127 rows × 33 columns
correlation_matrix = bank.corr()
print(correlation_matrix)
Attrition_Flag Customer_Age Gender \
Attrition_Flag 1.000000 0.018203 0.037272
Customer_Age 0.018203 1.000000 0.017312
Gender 0.037272 0.017312 1.000000
Dependent_count 0.018991 -0.122254 -0.004563
Months_on_book 0.013687 0.788912 0.006728
Total_Relationship_Count -0.150005 -0.010931 -0.003157
Months_Inactive_12_mon 0.152449 0.054361 0.011163
Contacts_Count_12_mon 0.204491 -0.018452 -0.039987
Credit_Limit -0.023873 0.002476 -0.420806
Total_Revolving_Bal -0.263053 0.014780 -0.029658
Avg_Open_To_Buy -0.000285 0.001151 -0.418059
Total_Amt_Chng_Q4_Q1 -0.131063 -0.062042 -0.026712
Total_Trans_Amt -0.168598 -0.046446 -0.024890
Total_Trans_Ct -0.371403 -0.067097 0.067454
Total_Ct_Chng_Q4_Q1 -0.290054 -0.012143 0.005800
Avg_Utilization_Ratio -0.178410 0.007114 0.257851
College -0.007840 -0.014788 -0.002610
Doctorate 0.029386 0.025199 0.017631
Graduate -0.009046 -0.000203 0.006438
High School -0.011730 0.001199 -0.018361
Post-Graduate 0.011127 -0.022081 -0.009001
Uneducated -0.001444 0.005057 0.005174
$120K + 0.009584 0.044332 -0.294775
$40K - $60K -0.011688 -0.013804 0.034718
$60K - $80K -0.028221 -0.017869 -0.424892
$80K - $120K -0.003459 0.005381 -0.448017
Less than $40K 0.022466 -0.002573 0.580016
Divorced 0.000850 -0.042614 0.004726
Married -0.023735 0.047364 -0.011427
Single 0.019037 -0.011248 0.015756
Blue 0.003216 0.021409 0.085200
Gold 0.005973 -0.011901 -0.043453
Silver -0.008467 -0.019425 -0.072699
Dependent_count Months_on_book \
Attrition_Flag 0.018991 0.013687
Customer_Age -0.122254 0.788912
Gender -0.004563 0.006728
Dependent_count 1.000000 -0.103062
Months_on_book -0.103062 1.000000
Total_Relationship_Count -0.039076 -0.009203
Months_Inactive_12_mon -0.010768 0.074164
Contacts_Count_12_mon -0.040505 -0.010774
Credit_Limit 0.068065 0.007507
Total_Revolving_Bal -0.002688 0.008623
Avg_Open_To_Buy 0.068291 0.006732
Total_Amt_Chng_Q4_Q1 -0.035439 -0.048959
Total_Trans_Amt 0.025046 -0.038591
Total_Trans_Ct 0.049912 -0.049819
Total_Ct_Chng_Q4_Q1 0.011087 -0.014072
Avg_Utilization_Ratio -0.037135 -0.007541
College 0.003369 -0.010281
Doctorate -0.003368 0.024114
Graduate 0.000671 0.003531
High School -0.013127 0.002637
Post-Graduate 0.009459 -0.016703
Uneducated 0.002190 0.001099
$120K + 0.024831 0.033532
$40K - $60K -0.013493 -0.003204
$60K - $80K 0.028975 -0.016635
$80K - $120K 0.047611 0.007720
Less than $40K -0.046785 -0.004300
Divorced 0.006697 -0.027678
Married 0.014385 0.033194
Single -0.040707 -0.005065
Blue -0.027671 0.014934
Gold 0.027035 -0.005426
Silver 0.017322 -0.014357
Total_Relationship_Count Months_Inactive_12_mon \
Attrition_Flag -0.150005 0.152449
Customer_Age -0.010931 0.054361
Gender -0.003157 0.011163
Dependent_count -0.039076 -0.010768
Months_on_book -0.009203 0.074164
Total_Relationship_Count 1.000000 -0.003675
Months_Inactive_12_mon -0.003675 1.000000
Contacts_Count_12_mon 0.055203 0.029493
Credit_Limit -0.071386 -0.020394
Total_Revolving_Bal 0.013726 -0.042210
Avg_Open_To_Buy -0.072601 -0.016605
Total_Amt_Chng_Q4_Q1 0.050119 -0.032247
Total_Trans_Amt -0.347229 -0.036982
Total_Trans_Ct -0.241891 -0.042787
Total_Ct_Chng_Q4_Q1 0.040831 -0.038989
Avg_Utilization_Ratio 0.067663 -0.007503
College -0.013582 0.004038
Doctorate -0.009077 0.002432
Graduate 0.005397 0.005885
High School -0.001707 -0.005575
Post-Graduate 0.012050 -0.006240
Uneducated 0.008202 0.010127
$120K + -0.006090 -0.002661
$40K - $60K -0.008747 -0.021437
$60K - $80K 0.005475 -0.004616
$80K - $120K 0.001185 -0.005910
Less than $40K 0.005109 0.018848
Divorced 0.009276 0.001796
Married 0.017001 -0.007065
Single -0.017329 0.008973
Blue 0.085789 0.016565
Gold -0.055678 -0.003283
Silver -0.060566 -0.016034
Contacts_Count_12_mon Credit_Limit \
Attrition_Flag 0.204491 -0.023873
Customer_Age -0.018452 0.002476
Gender -0.039987 -0.420806
Dependent_count -0.040505 0.068065
Months_on_book -0.010774 0.007507
Total_Relationship_Count 0.055203 -0.071386
Months_Inactive_12_mon 0.029493 -0.020394
Contacts_Count_12_mon 1.000000 0.020817
Credit_Limit 0.020817 1.000000
Total_Revolving_Bal -0.053913 0.042493
Avg_Open_To_Buy 0.025646 0.995981
Total_Amt_Chng_Q4_Q1 -0.024445 0.012813
Total_Trans_Amt -0.112774 0.171730
Total_Trans_Ct -0.152213 0.075927
Total_Ct_Chng_Q4_Q1 -0.094997 -0.002020
Avg_Utilization_Ratio -0.055471 -0.482965
College -0.008996 0.001929
Doctorate -0.001016 -0.005195
Graduate 0.002660 -0.004844
High School -0.003927 -0.001432
Post-Graduate -0.006878 0.005879
Uneducated 0.012596 0.012213
$120K + 0.020744 0.339211
$40K - $60K 0.002336 -0.161605
$60K - $80K 0.000167 0.093808
$80K - $120K 0.011473 0.333828
Less than $40K -0.023255 -0.395233
Divorced -0.008389 0.022578
Married 0.001598 -0.056713
Single 0.007997 0.032309
Blue 0.000929 -0.515644
Gold -0.000685 0.234331
Silver -0.001059 0.441030
Total_Revolving_Bal ... $40K - $60K $60K - $80K \
Attrition_Flag -0.263053 ... -0.011688 -0.028221
Customer_Age 0.014780 ... -0.013804 -0.017869
Gender -0.029658 ... 0.034718 -0.424892
Dependent_count -0.002688 ... -0.013493 0.028975
Months_on_book 0.008623 ... -0.003204 -0.016635
Total_Relationship_Count 0.013726 ... -0.008747 0.005475
Months_Inactive_12_mon -0.042210 ... -0.021437 -0.004616
Contacts_Count_12_mon -0.053913 ... 0.002336 0.000167
Credit_Limit 0.042493 ... -0.161605 0.093808
Total_Revolving_Bal 1.000000 ... 0.005220 -0.003847
Avg_Open_To_Buy -0.047167 ... -0.162039 0.094133
Total_Amt_Chng_Q4_Q1 0.058174 ... -0.000867 0.022399
Total_Trans_Amt 0.064370 ... 0.000209 0.005502
Total_Trans_Ct 0.056060 ... 0.002801 -0.027517
Total_Ct_Chng_Q4_Q1 0.089861 ... -0.002159 0.014402
Avg_Utilization_Ratio 0.624022 ... 0.070035 -0.100068
College -0.011058 ... 0.003405 -0.007854
Doctorate -0.018208 ... -0.012193 -0.004764
Graduate -0.000356 ... 0.000061 -0.006836
High School 0.019276 ... -0.000524 0.020287
Post-Graduate 0.007068 ... 0.023301 0.007234
Uneducated -0.004446 ... -0.010118 -0.008775
$120K + 0.020264 ... -0.128862 -0.111479
$40K - $60K 0.005220 ... 1.000000 -0.185743
$60K - $80K -0.003847 ... -0.185743 1.000000
$80K - $120K 0.022776 ... -0.195852 -0.169433
Less than $40K -0.015974 ... -0.341238 -0.295207
Divorced -0.002368 ... 0.005728 0.004860
Married 0.039124 ... -0.006464 0.006952
Single -0.037794 ... 0.003745 -0.008723
Blue -0.022681 ... 0.007328 -0.037802
Gold 0.023974 ... -0.013388 0.034771
Silver 0.012799 ... 0.001025 0.024076
$80K - $120K Less than $40K Divorced Married \
Attrition_Flag -0.003459 0.022466 0.000850 -0.023735
Customer_Age 0.005381 -0.002573 -0.042614 0.047364
Gender -0.448017 0.580016 0.004726 -0.011427
Dependent_count 0.047611 -0.046785 0.006697 0.014385
Months_on_book 0.007720 -0.004300 -0.027678 0.033194
Total_Relationship_Count 0.001185 0.005109 0.009276 0.017001
Months_Inactive_12_mon -0.005910 0.018848 0.001796 -0.007065
Contacts_Count_12_mon 0.011473 -0.023255 -0.008389 0.001598
Credit_Limit 0.333828 -0.395233 0.022578 -0.056713
Total_Revolving_Bal 0.022776 -0.015974 -0.002368 0.039124
Avg_Open_To_Buy 0.331716 -0.393718 0.022786 -0.060209
Total_Amt_Chng_Q4_Q1 0.012905 -0.025276 -0.005613 0.052388
Total_Trans_Amt 0.009814 -0.008816 0.010392 -0.063030
Total_Trans_Ct -0.038939 0.040664 0.024863 -0.123911
Total_Ct_Chng_Q4_Q1 0.002922 -0.001399 -0.011503 0.013639
Avg_Utilization_Ratio -0.173606 0.273511 -0.018385 0.049684
College 0.019690 -0.007724 0.014065 -0.001214
Doctorate -0.015165 -0.000589 0.004920 -0.003584
Graduate 0.002308 0.017496 -0.004936 0.013413
High School 0.001987 -0.019091 -0.019568 0.008604
Post-Graduate 0.003490 -0.010762 0.004957 0.003768
Uneducated -0.006529 -0.000514 0.027912 -0.018026
$120K + -0.117547 -0.204804 -0.002483 0.013448
$40K - $60K -0.195852 -0.341238 0.005728 -0.006464
$60K - $80K -0.169433 -0.295207 0.004860 0.006952
$80K - $120K 1.000000 -0.311274 -0.010926 0.013568
Less than $40K -0.311274 1.000000 -0.007134 -0.008341
Divorced -0.010926 -0.007134 1.000000 -0.262132
Married 0.013568 -0.008341 -0.262132 1.000000
Single -0.020703 0.018028 -0.225502 -0.741185
Blue -0.038508 0.069699 -0.001440 0.051688
Gold 0.008843 -0.032630 -0.012659 -0.023612
Silver 0.039775 -0.059205 0.008305 -0.044260
Single Blue Gold Silver
Attrition_Flag 0.019037 0.003216 0.005973 -0.008467
Customer_Age -0.011248 0.021409 -0.011901 -0.019425
Gender 0.015756 0.085200 -0.043453 -0.072699
Dependent_count -0.040707 -0.027671 0.027035 0.017322
Months_on_book -0.005065 0.014934 -0.005426 -0.014357
Total_Relationship_Count -0.017329 0.085789 -0.055678 -0.060566
Months_Inactive_12_mon 0.008973 0.016565 -0.003283 -0.016034
Contacts_Count_12_mon 0.007997 0.000929 -0.000685 -0.001059
Credit_Limit 0.032309 -0.515644 0.234331 0.441030
Total_Revolving_Bal -0.037794 -0.022681 0.023974 0.012799
Avg_Open_To_Buy 0.035691 -0.513502 0.232132 0.439790
Total_Amt_Chng_Q4_Q1 -0.043323 -0.005710 0.006219 0.002621
Total_Trans_Amt 0.044772 -0.194374 0.103986 0.154983
Total_Trans_Ct 0.098586 -0.130755 0.076399 0.100971
Total_Ct_Chng_Q4_Q1 -0.009774 0.005433 -0.002728 -0.003066
Avg_Utilization_Ratio -0.030493 0.214006 -0.085041 -0.190051
College -0.005681 -0.005064 0.010505 0.003591
Doctorate 0.006284 0.003367 -0.000747 -0.005714
Graduate -0.009162 -0.013194 0.000342 0.012746
High School -0.000899 0.012123 0.006841 -0.013394
Post-Graduate -0.010966 -0.008533 -0.003842 0.007342
Uneducated 0.004022 0.006045 -0.015817 0.000621
$120K + -0.007109 -0.049146 0.034771 0.033880
$40K - $60K 0.003745 0.007328 -0.013388 0.001025
$60K - $80K -0.008723 -0.037802 0.034771 0.024076
$80K - $120K -0.020703 -0.038508 0.008843 0.039775
Less than $40K 0.018028 0.069699 -0.032630 -0.059205
Divorced -0.225502 -0.001440 -0.012659 0.008305
Married -0.741185 0.051688 -0.023612 -0.044260
Single 1.000000 -0.040122 0.024426 0.031060
Blue -0.040122 1.000000 -0.397782 -0.889816
Gold 0.024426 -0.397782 1.000000 -0.025920
Silver 0.031060 -0.889816 -0.025920 1.000000
[33 rows x 33 columns]
plt.figure(figsize=(20,20))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Pearson Correlation Heatmap')
plt.show()
# Separate features (X) and the target variable (y)
X = bank[bank.columns[1:]] # Features
y = bank[bank.columns[0]] # Target variable
x_train, x_test, y_train, y_test = train_test_split(X, bank['Attrition_Flag'], test_size = 0.2, random_state=1234)
# Initialize and train a logistic regression model
logistic_regression = LogisticRegression(max_iter=1000) # You can adjust 'max_iter' as needed
logistic_regression.fit(x_train, y_train)
# Make predictions on the test set
y_pred = logistic_regression.predict(x_test)
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Calculate the recall score
recall = recall_score(y_test, y_pred)
# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)
# Evaluate the model's performance (accuracy, classification report)
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)
# Print the results
print("Recall:", recall)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report_str)
Confusion Matrix:
[[1655 52]
[ 152 167]]
Recall: 0.5235109717868338
Accuracy: 0.8993089832181639
Classification Report:
precision recall f1-score support
0 0.92 0.97 0.94 1707
1 0.76 0.52 0.62 319
accuracy 0.90 2026
macro avg 0.84 0.75 0.78 2026
weighted avg 0.89 0.90 0.89 2026
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
xticklabels=['Predicted 0', 'Predicted 1'],
yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
Recall reveals the proportion of churns identified correctly by the total number of churns.In this model we have achieved a recall rate of 50%.
# Initialize and apply SMOTE to oversample the minority class
oversample = SMOTE()
X_resampled, y_resampled = oversample.fit_resample(X, y)
# Create a new DataFrame with the resampled data
resampled_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='Churn')], axis=1)
correlation_matrix=resampled_df.corr()
plt.figure(figsize=(20,20))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Pearson Correlation Heatmap')
plt.show()
resampled_df
| Customer_Age | Gender | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | ... | $60K - $80K | $80K - $120K | Less than $40K | Divorced | Married | Single | Blue | Gold | Silver | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | 0 | 3 | 39 | 5 | 1 | 3 | 12691.000000 | 777 | 11914.000000 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 1 | 49 | 1 | 5 | 44 | 6 | 1 | 2 | 8256.000000 | 864 | 7392.000000 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| 2 | 51 | 0 | 3 | 36 | 4 | 1 | 0 | 3418.000000 | 0 | 3418.000000 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 3 | 40 | 1 | 4 | 34 | 3 | 4 | 1 | 3313.000000 | 2517 | 796.000000 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 40 | 0 | 3 | 21 | 5 | 1 | 0 | 4716.000000 | 0 | 4716.000000 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16995 | 38 | 1 | 3 | 33 | 4 | 2 | 1 | 4046.834158 | 0 | 4046.834158 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| 16996 | 56 | 1 | 1 | 51 | 3 | 4 | 2 | 3312.043325 | 881 | 2430.263996 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 16997 | 43 | 0 | 2 | 32 | 4 | 3 | 1 | 1438.300000 | 644 | 793.666319 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 16998 | 50 | 0 | 0 | 41 | 2 | 3 | 1 | 3427.622846 | 0 | 3427.622846 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
| 16999 | 47 | 0 | 2 | 38 | 1 | 3 | 1 | 32037.279882 | 1181 | 30855.383056 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
17000 rows × 33 columns
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
# Initialize the logistic regression model
logistic_reg = LogisticRegression()
# Fit the model on the training data
logistic_reg.fit(X_train, y_train)
# Predict the target values on the testing data
y_pred = logistic_reg.predict(X_test)
# Evaluate the model
classification_report(y_test, y_pred)
# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Calculate the recall score
recall = recall_score(y_test, y_pred)
# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)
# Evaluate the model's performance (accuracy, classification report)
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)
# Print the results
print("Accuracy:", accuracy)
print("Recall:",recall)
print("Classification Report:\n", classification_report_str)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
xticklabels=['Predicted 0', 'Predicted 1'],
yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
C:\Users\junej\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Confusion Matrix:
[[1313 363]
[ 348 1376]]
Accuracy: 0.7908823529411765
Recall: 0.7981438515081206
Classification Report:
precision recall f1-score support
0 0.79 0.78 0.79 1676
1 0.79 0.80 0.79 1724
accuracy 0.79 3400
macro avg 0.79 0.79 0.79 3400
weighted avg 0.79 0.79 0.79 3400
After using SMOTE (SMOTE selects k-nearest neighbors from the same class. It then generates synthetic examples by interpolating between the selected instance and its k-nearest neighbors. )we have achived a recall rate of 70%.
For this analysis, I will be clustering credit card users based on their card activity, and then I will dig a little deeper to see what other attributes may be unique to each group.
cols = uncoded_bank.iloc[:, 9:].select_dtypes(['uint8', 'int64', 'float64']).columns
cols
Index(['Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
dtype='object')
k_values = []
silhouette_scores=[]
for i in range(2,16):
km = KMeans(n_clusters=i, init='k-means++', max_iter=500, n_init=10, random_state=1)
data = StandardScaler().fit_transform(uncoded_bank[cols])
km.fit(data)
k_values.append(km.inertia_)
silhouette_scores.append(silhouette_score(data, km.labels_))
# Print K values and corresponding Silhouette Scores
for k, score in zip(k_values, silhouette_scores):
print(f"K={k}, Silhouette Score={score}")
K=103753.61777446116, Silhouette Score=0.20650462503448958 K=92137.3160829283, Silhouette Score=0.13826893294481118 K=82913.83055674178, Silhouette Score=0.1579085689998887 K=77909.45717622529, Silhouette Score=0.16047194400764273 K=74096.81242118758, Silhouette Score=0.13579879293950167 K=71151.9786183667, Silhouette Score=0.12264673133282393 K=68748.96808991095, Silhouette Score=0.12155307422987284 K=66469.25033778106, Silhouette Score=0.11760435326006441 K=64466.90538157148, Silhouette Score=0.11692241034233193 K=62778.65613839377, Silhouette Score=0.11643201103408396 K=61221.66970579319, Silhouette Score=0.11764345360079785 K=59967.26443416673, Silhouette Score=0.11682344276252803 K=58612.71350106147, Silhouette Score=0.11479535312915867 K=57686.826737998366, Silhouette Score=0.11367270215947615
fig = plt.figure(figsize=(10,8))
ax = sns.lineplot(x=range(2,16), y=k_values)
ax.set_title('Elbow Method')
ax.title.set_size(20)
plt.xlabel('K Clusters')
plt.ylabel('Within-cluster Sums of Squares')
plt.show()
It looks like the optimal number of clusters is somewhere between 4 and 6. After trying each number, 6 clusters returned the highest Silhouette Score, so I decided to proceed with 6 clusters. The Silhoutte score is a measure of how well defined clusters are, with scores near 1 indicating well-defined clusters, and scores near 0 indicating overlapping clusters. Now let's create a pipeline that will first process our data and then create a KMeans model.
For preprocessing, we will first normalize our data, and then perform Principal Component Analysis.
Principal Component Analysis, or PCA, is a method that compresses the information found in the explanatory variables into a reduced number of new variables. This reduces the noise and dimensionality of the data while preserving most of the original information.
preprocessor = Pipeline(
[
("scaler", StandardScaler()),
("pca", PCA(n_components=2, random_state=1))
]
)
clusterer = Pipeline(
[
(
"kmeans",
KMeans(
n_clusters=6,
init="k-means++",
n_init=50,
max_iter=1000,
random_state=1
)
)
]
)
pipe = Pipeline(
[
("preprocessor", preprocessor),
("clusterer", clusterer)
]
)
# Fit the pipeline to your data
# Assuming you have a dataset 'X' to fit the pipeline on
pipe.fit(uncoded_bank[cols])
# Access the explained variance ratios
explained_variance_ratios = pipe.named_steps['preprocessor'].named_steps['pca'].explained_variance_ratio_
# Print the variance explained by both components
print('Variance Explained by Component 1:', explained_variance_ratios[0])
print('Variance Explained by Component 2:', explained_variance_ratios[1])
Variance Explained by Component 1: 0.21305681674068852 Variance Explained by Component 2: 0.16749519637408963
PCA1 and PCA2 explain 38% of variance, now let's add the cluster labels to our data and see how well our clusters are defined.PCA can help in improving the separation of clusters by emphasizing the directions of maximum variance. This can potentially lead to better-defined clusters.
preprocessed_data = pipe["preprocessor"].transform(uncoded_bank[cols])
predicted_labels = pipe["clusterer"]["kmeans"].labels_
pcadf = pd.DataFrame(
pipe["preprocessor"].transform(uncoded_bank[cols]),
columns=["component_1", "component_2"]
)
pcadf["predicted_cluster"] = pipe["clusterer"]["kmeans"].labels_
plt.figure(figsize=(8, 8))
sp = sns.scatterplot(
x="component_1",
y="component_2",
s=50,
data=pcadf,
hue="predicted_cluster",
palette='tab10'
)
sp.set_title("Clustering results from credit card use")
ax.title.set_size(20)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
plt.show()
As we saw from the Silhouette score, there is clearly some overlap between clusters, but overall there appears to be several distinct clusters in the data.
uncoded_bank['cluster'] = pipe['clusterer']['kmeans'].labels_
uncoded_bank.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | 0 | 45 | 0 | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 3 |
| 1 | 818770008 | 0 | 49 | 1 | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 1 |
| 2 | 713982108 | 0 | 51 | 0 | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 1 |
| 3 | 769911858 | 0 | 40 | 1 | 4 | High School | Unknown | Less than $40K | Blue | 34 | ... | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 1 |
| 4 | 709106358 | 0 | 40 | 0 | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 1 |
5 rows × 22 columns
fig, axs = plt.subplots(figsize=(12,8))
ax = uncoded_bank['cluster'].value_counts().plot.pie(autopct='%1.0f%%')
ax.set_ylabel('')
ax.set_title('Cluster Composition')
ax.title.set_size(20)
plt.show()
fig, axs = plt.subplots(ncols=2,nrows=3, figsize=(20,14))
uncoded_bank[uncoded_bank['cluster'] == 0]['Attrition_Flag'].value_counts().sort_index().plot.barh(ax=axs[0,0], title='Group 1')
uncoded_bank[uncoded_bank['cluster'] == 1]['Attrition_Flag'].value_counts().sort_index().plot.barh(ax=axs[0,1], title='Group 2')
uncoded_bank[uncoded_bank['cluster'] == 2]['Attrition_Flag'].value_counts().sort_index().plot.barh(ax=axs[1,0], title='Group 3')
uncoded_bank[uncoded_bank['cluster'] == 3]['Attrition_Flag'].value_counts().sort_index().plot.barh(ax=axs[1,1], title='Group 4')
uncoded_bank[uncoded_bank['cluster'] == 4]['Attrition_Flag'].value_counts().sort_index().plot.barh(ax=axs[2,0], title='Group 5')
uncoded_bank[uncoded_bank['cluster'] == 5]['Attrition_Flag'].value_counts().sort_index().plot.barh(ax=axs[2,1], title='Group 6')
plt.show()
It looks like Groups 2 and 4 have a significantly higher rate of customer attrition, while Group 1 has a significantly lower rate of attrition.
scaler = StandardScaler()
scaled_cc = pd.DataFrame(scaler.fit_transform(uncoded_bank[cols]), columns=cols)
scaled_cc['cluster'] = pipe['clusterer']['kmeans'].labels_
scaled_cc.head()
| Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.384621 | 0.763943 | -1.327136 | 0.492404 | 0.446622 | -0.473422 | 0.488971 | 2.623494 | -0.959707 | -0.973895 | 3.834003 | -0.775882 | 3 |
| 1 | 1.010715 | 1.407306 | -1.327136 | -0.411616 | -0.041367 | -0.366667 | -0.008486 | 3.563293 | -0.916433 | -1.357340 | 12.608573 | -0.616276 | 1 |
| 2 | 0.008965 | 0.120579 | -1.327136 | -2.219655 | -0.573698 | -1.426858 | -0.445658 | 8.367214 | -0.740982 | -1.911206 | 6.807864 | -0.997155 | 1 |
| 3 | -0.241473 | -0.522785 | 1.641478 | -1.315636 | -0.585251 | 1.661686 | -0.734100 | 2.942843 | -0.951758 | -1.911206 | 6.807864 | 1.759686 | 1 |
| 4 | -1.869317 | 0.763943 | -1.327136 | -2.219655 | -0.430877 | -1.426858 | -0.302868 | 6.455682 | -1.056263 | -1.570365 | 7.509325 | -0.997155 | 1 |
ix, axs = plt.subplots(ncols=2,nrows=3, figsize=(20,32))
scaled_cc[scaled_cc['cluster'] == 0][cols].mean().plot.barh(ax=axs[0,0], xlim=(-1.5, 3), figsize=(20,20), sharey=True, title='Group 1')
scaled_cc[scaled_cc['cluster'] == 1][cols].mean().plot.barh(ax=axs[0,1], xlim=(-1.5, 3), figsize=(20,20), sharey=True, title='Group 2')
scaled_cc[scaled_cc['cluster'] == 2][cols].mean().plot.barh(ax=axs[1,0], xlim=(-1.5, 3), figsize=(20,20), sharey=True, title='Group 3')
scaled_cc[scaled_cc['cluster'] == 3][cols].mean().plot.barh(ax=axs[1,1], xlim=(-1.5, 3), figsize=(20,20), sharey=True, title='Group 4')
scaled_cc[scaled_cc['cluster'] == 4][cols].mean().plot.barh(ax=axs[2,0], xlim=(-1.5, 3), figsize=(20,20), sharey=True, title='Group 5')
scaled_cc[scaled_cc['cluster'] == 5][cols].mean().plot.barh(ax=axs[2,1], xlim=(-1.5, 3), figsize=(20,20), sharey=True, title='Group 6')
plt.show()
Looking at the charts above, we can see the groups have the following characteristics:
Group 1: High transaction count and amount, higher balance, low relationship count, lower contact count
Group 2: Low transaction count and amount, lower count and amount change from Q4 to Q1, high contact count, higher relationship count, higher months on book, higher months inactive, lower balance
Group 3: High utilization ratio, higher count and amount change from Q4 to Q1, low avg open to buy, higher balance, low credit limit
Group 4: Low transaction count and amount, low utilization ratio, lower count and amount change from Q4 to Q1, high contact count, higher relationship count, lower balance
Group 5: Lower utilization ratio, higher transaction count, higher transaction count change from Q1 to Q4, low balance, low credit limit
Group 6: High transaction count and amount, high avg open to buy, high credit limit, low relationship count, low utilization ratio
Based on the results above, I decided to give the following names to each of the groups:
Group 1: Higher balance heavy users
Group 2: Long-time very light users
Group 3: Low limit, high balance light users
Group 4: High limit, low balance light users
Group 5: Low limit, low balance light users
Group 6: High limit, low balance heavy users
fig, axs = plt.subplots(ncols=2,nrows=3, figsize=(12,14))
sns.histplot(uncoded_bank[uncoded_bank['cluster'] == 0]['Customer_Age'], ax=axs[0,0], binrange=(20,70), binwidth=5).set(title='Higher balance heavy users', ylabel='', xlabel='')
sns.histplot(uncoded_bank[uncoded_bank['cluster'] == 1]['Customer_Age'], ax=axs[0,1], binrange=(20,70), binwidth=5).set(title='Long-time very light users', ylabel='', xlabel='')
sns.histplot(uncoded_bank[uncoded_bank['cluster'] == 2]['Customer_Age'], ax=axs[1,0], binrange=(20,70), binwidth=5).set(title='Low limit, high balance light users', ylabel='', xlabel='')
sns.histplot(uncoded_bank[uncoded_bank['cluster'] == 3]['Customer_Age'], ax=axs[1,1], binrange=(20,70), binwidth=5).set(title='High limit, low balance light users', ylabel='', xlabel='')
sns.histplot(uncoded_bank[uncoded_bank['cluster'] == 4]['Customer_Age'], ax=axs[2,0], binrange=(20,70), binwidth=5).set(title='Low limit, low balance light users', ylabel='', xlabel='')
sns.histplot(uncoded_bank[uncoded_bank['cluster'] == 5]['Customer_Age'], ax=axs[2,1], binrange=(20,70), binwidth=5).set(title='High limit, low balance heavy users', ylabel='', xlabel='')
plt.show()
It looks like the majority of users are in their 40's and 50's for all groups. Long-time light users have a higher number of users in their 60's compared to other groups. Long-time light users and Low-limit high balance light users seem to have a higher number of users in their 30's.
fix, axs = plt.subplots(ncols=2,nrows=3, figsize=(12,14))
uncoded_bank[uncoded_bank['cluster'] == 0]['Gender'].value_counts().sort_index(ascending=False).plot.pie(ax=axs[0,0], ylabel='', title='Higher balance heavy users', autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 1]['Gender'].value_counts().sort_index(ascending=False).plot.pie(ax=axs[0,1], ylabel='', title='Long-time very light users', autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 2]['Gender'].value_counts().sort_index(ascending=False).plot.pie(ax=axs[1,0], ylabel='', title='Low limit, high balance light users', autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 3]['Gender'].value_counts().sort_index(ascending=False).plot.pie(ax=axs[1,1], ylabel='', title='High limit, low balance light users', autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 4]['Gender'].value_counts().sort_index(ascending=False).plot.pie(ax=axs[2,0], ylabel='', title='Low limit, low balance light users', autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 5]['Gender'].value_counts().sort_index(ascending=False).plot.pie(ax=axs[2,1], ylabel='', title='High limit, low balance heavy users', autopct='%.1f%%')
plt.show()
fix, axs = plt.subplots(ncols=2,nrows=3, figsize=(12,14))
uncoded_bank[uncoded_bank['cluster'] == 0]['Dependent_count'].value_counts().sort_index().plot.bar(ax=axs[0,0], title='Higher balance heavy users')
uncoded_bank[uncoded_bank['cluster'] == 1]['Dependent_count'].value_counts().sort_index().plot.bar(ax=axs[0,1], title='Long-time very light users')
uncoded_bank[uncoded_bank['cluster'] == 2]['Dependent_count'].value_counts().sort_index().plot.bar(ax=axs[1,0], title='Low limit, high balance light users')
uncoded_bank[uncoded_bank['cluster'] == 3]['Dependent_count'].value_counts().sort_index().plot.bar(ax=axs[1,1], title='High limit, low balance light users')
uncoded_bank[uncoded_bank['cluster'] == 4]['Dependent_count'].value_counts().sort_index().plot.bar(ax=axs[2,0], title='Low limit, low balance light users')
uncoded_bank[uncoded_bank['cluster'] == 5]['Dependent_count'].value_counts().sort_index().plot.bar(ax=axs[2,1], title='High limit, low balance heavy users')
plt.show()
fix, axs = plt.subplots(ncols=2,nrows=3, figsize=(12,14))
uncoded_bank[uncoded_bank['cluster'] == 0]['Education_Level'].value_counts().sort_index().plot.pie(ax=axs[0,0], ylabel='', title='Higher balance heavy users', colors=['r','b','g','y','orange', 'gray','pink'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 1]['Education_Level'].value_counts().sort_index().plot.pie(ax=axs[0,1], ylabel='', title='Long-time very light users', colors=['r','b','g','y','orange', 'gray','pink'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 2]['Education_Level'].value_counts().sort_index().plot.pie(ax=axs[1,0], ylabel='', title='Low limit, high balance light users', colors=['r','b','g','y','orange', 'gray','pink'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 3]['Education_Level'].value_counts().sort_index().plot.pie(ax=axs[1,1], ylabel='', title='High limit, low balance light users', colors=['r','b','g','y','orange', 'gray','pink'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 4]['Education_Level'].value_counts().sort_index().plot.pie(ax=axs[2,0], ylabel='', title='Low limit, low balance light users', colors=['r','b','g','y','orange', 'gray','pink'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 5]['Education_Level'].value_counts().sort_index().plot.pie(ax=axs[2,1], ylabel='', title='High limit, low balance heavy users', colors=['r','b','g','y','orange', 'gray','pink'], autopct='%.1f%%')
plt.show()
There doesn't seem to be any significant differences in education between groups.
fix, axs = plt.subplots(ncols=2,nrows=3, figsize=(12,14))
uncoded_bank[uncoded_bank['cluster'] == 0]['Marital_Status'].value_counts().sort_index().plot(kind='pie', ax=axs[0,0], ylabel='', title='Higher balance heavy users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 1]['Marital_Status'].value_counts().sort_index().plot(kind='pie', ax=axs[0,1], ylabel='', title='Long-time very light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 2]['Marital_Status'].value_counts().sort_index().plot(kind='pie', ax=axs[1,0], ylabel='', title='Low limit, high balance light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 3]['Marital_Status'].value_counts().sort_index().plot(kind='pie', ax=axs[1,1], ylabel='', title='High limit, low balance light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 4]['Marital_Status'].value_counts().sort_index().plot(kind='pie', ax=axs[2,0], ylabel='', title='Low limit, low balance light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 5]['Marital_Status'].value_counts().sort_index().plot(kind='pie', ax=axs[2,1], ylabel='', title='High limit, low balance heavy users', colors=['r','b','g','y'], autopct='%.1f%%')
plt.show()
It looks like card limit and income are significantly correlated, with at least 60% of users in the high limit groups making at least 80K, and at least 50% of users in the low limit and high balance groups making less than 60K.
fix, axs = plt.subplots(ncols=2,nrows=3, figsize=(12,14))
uncoded_bank[uncoded_bank['cluster'] == 0]['Card_Category'].value_counts().sort_index().plot(kind='pie', ax=axs[0,0], ylabel='', title='Higher balance heavy users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 1]['Card_Category'].value_counts().sort_index().plot(kind='pie', ax=axs[0,1], ylabel='', title='Long-time very light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 2]['Card_Category'].value_counts().sort_index().plot(kind='pie', ax=axs[1,0], ylabel='', title='Low limit, high balance light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 3]['Card_Category'].value_counts().sort_index().plot(kind='pie', ax=axs[1,1], ylabel='', title='High limit, low balance light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 4]['Card_Category'].value_counts().sort_index().plot(kind='pie', ax=axs[2,0], ylabel='', title='Low limit, low balance light users', colors=['r','b','g','y'], autopct='%.1f%%')
uncoded_bank[uncoded_bank['cluster'] == 5]['Card_Category'].value_counts().sort_index().plot(kind='pie', ax=axs[2,1], ylabel='', title='High limit, low balance heavy users', colors=['r','b','g','y'], autopct='%.1f%%')
plt.show()
As expected, Silver, Gold, and Platinum card holders mostly fall into the high limit groups.
Conclusion Based on the our model, we clustered the credit card users into the following groups:
Group 1: Higher balance heavy users
Group 2: Long-time very light users
Group 3: Low limit, high balance light users
Group 4: High limit, low balance light users
Group 5: Low limit, low balance light users
Group 6: High limit, low balance heavy users
Looking at the demographic data of each group, we also found the following:
Long-time light users are the most likely to turnover.
Most users are in their 40's and 50's.
High limit users are mostly male.
The ratio of married users is higher among Long-time light users.
High limit users have significantly higher income than other groups, and are more likely to carry premium cards.
Through this analysis we have gained a good understanding of the groups that exist within the customers, as well as some of the attributes of each group.